“Una observación que se desvía mucho de otras observaciones y despierta sospechas de ser generada por un mecanismo diferente”
ozone.data<-read.csv("../bases de datos/ozone.csv",
stringsAsFactors = F)
names(ozone.data)
## [1] "Month" "Day_of_month"
## [3] "Day_of_week" "ozone_reading"
## [5] "pressure_height" "Wind_speed"
## [7] "Humidity" "Temperature_Sandburg"
## [9] "Temperature_ElMonte" "Inversion_base_height"
## [11] "Pressure_gradient" "Inversion_temperature"
## [13] "Visibility"
boxplot(ozone.data$pressure_height,
main = "Pressure Height",
boxwex = 0.5,col="blue")
#out
impute_outliers <- function(x, removeNA = TRUE){
quantiles <- quantile(x, c(0.05, 0.95), na.rm = removeNA)
x[x<quantiles[1]] <- mean(x, na.rm = removeNA)
x[x>quantiles[2]] <- median(x, na.rm = removeNA)
x
}
imputed_data <- impute_outliers(ozone.data$pressure_height)
par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Presión con outliers",
col = 3)
boxplot(imputed_data, main = "Presión sin outliers",col=2)
replace_outliers <- function(x, removeNA = TRUE){
qrts <- quantile(x, probs = c(0.25, 0.75), na.rm = removeNA)
caps <- quantile(x, probs = c(.05, .95), na.rm = removeNA)
iqr <- qrts[2]-qrts[1]
h <- 1.5 * iqr
x[x<qrts[1]-h] <- caps[1]
x[x>qrts[2]+h] <- caps[2]
x
}
capped_pressure_height <- replace_outliers(ozone.data$pressure_height)
par(mfrow = c(1,2))
boxplot(ozone.data$pressure_height, main = "Presión con outliers"
,col=5)
boxplot(capped_pressure_height, main = "Presión sin outliers",col=6)
with(cars, plot(x=speed, y=dist))
#Punto de balanceo
n <- dim(cars)[1]
mycars <- rbind(cars, c(60, 218.3649))
with(mycars, plot(x=speed, y=dist))
points(mycars[n + 1, ], pch=20, col='blue')
mod1 <- lm(dist ~ speed, data=cars);abline(mod1)
Con los elementos de la diagonal de la matriz H
\[H=X(X^TX)^ {-1}X^T\] Si \(h_{ii}>\frac{2p}{n}\) La observación \(i\) se considera un punto de balanceo.
mod2 <- lm(dist ~ speed, data=mycars)
lm.influence(mod2)$hat
## 1 2 3 4 5 6
## 0.06498630 0.06498630 0.04551521 0.04551521 0.04022961 0.03554640
## 7 8 9 10 11 12
## 0.03146556 0.03146556 0.03146556 0.02798710 0.02798710 0.02511103
## 13 14 15 16 17 18
## 0.02511103 0.02511103 0.02511103 0.02283733 0.02283733 0.02283733
## 19 20 21 22 23 24
## 0.02283733 0.02116602 0.02116602 0.02116602 0.02116602 0.02009709
## 25 26 27 28 29 30
## 0.02009709 0.02009709 0.01963054 0.01963054 0.01976637 0.01976637
## 31 32 33 34 35 36
## 0.01976637 0.02050458 0.02050458 0.02050458 0.02050458 0.02184518
## 37 38 39 40 41 42
## 0.02184518 0.02184518 0.02378815 0.02378815 0.02378815 0.02378815
## 43 44 45 46 47 48
## 0.02378815 0.02948124 0.03323136 0.03758386 0.03758386 0.03758386
## 49 50 51
## 0.03758386 0.04253874 0.59545970
plot(lm.influence(mod2)$hat)
mycars1 <- rbind(cars, c(25, 10))
with(mycars1, plot(x=speed, y=dist,ylim = c(0,200),xlim = c(0,50),
points(25,10,col=2,pch=20)))
mod3 <- lm(dist ~ speed, data=mycars1);abline(mod1);abline(mod3,col=4)
Una distancia de Cook grande significa que una observación tiene un peso grande en la estimación de \(\beta\) .
\[D_j=\frac{\sum_{j=1}^n (\hat{y}_j-\hat{y}_{j(i)})}{p \hat{\sigma}^2}\] Cuando \(D_j=\frac{4}{n-p-2}\) se considera que la observación es un punto de influencia.
require(olsrr) # graficos
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'olsrr'
require(MPV) # cook
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'MPV'
softdrink <- MPV::softdrink
colnames(softdrink) <- c('tiempo', 'cantidad', 'distancia')
mod <- lm(tiempo ~ cantidad + distancia, data=softdrink)
cutoff <- 4/(25-3-2)
plot(mod, which=4, cook.levels=cutoff)
abline(h=cutoff, col='lightpink', lty='dashed')
Se puede investigar la influencia de eliminar la i-ésima observación sobre el valor predicho o ajustado.
\[DFFITS=\frac{\hat{y}_i-\hat{y}_{i(i)}}{S_i\sqrt(p/n)}\]
Se sugiere que merece investigarse toda observación con \(|DFFITS|>2\sqrt(p/n)\)
olsrr::ols_plot_dffits(mod)